Module import¶

In [ ]:
import pandas as pd
import numpy as np

from prophet import Prophet
from prophet.plot import add_changepoints_to_plot

from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

import plotly.express as px

import pickle

from dateutil.parser import *

import warnings
warnings.filterwarnings('ignore')
/home/naru/.pyenv/versions/3.9.13/envs/py3.9.13/lib/python3.9/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html
  from .autonotebook import tqdm as notebook_tqdm

파일 열기¶

In [ ]:
import os
# from datetime import datetime, timedelta

# def date_range(start, end):
#     start = datetime.strptime(start, "%Y%m%d")
#     end = datetime.strptime(end, "%Y%m%d")
#     dates = [(start + timedelta(days=i)).strftime("%Y%m%d") for i in range((end-start).days+1)]
#     return dates

# dates = date_range("20230322", "20230620")
# # print(dates)

date = '20230322'

os.chdir('./data/TCS_TCS원시자료_1일_1일_{}'.format(date))
# print(os.getcwd())
# print(os.listdir())

origin_data = pd.read_csv('/mnt/c/Users/user/source/ai_plus/real_data/data/TCS_TCS원시자료_1일_1일_{0}/{1}'.format(date, os.listdir()[0]), encoding='euc-kr')
origin_data
Out[ ]:
출구본부명 출구지사명 출구영업소코드 출구영업소명 처리일자 처리일시분초 TCS차종구분코드 TCS차종구분명 근무일자 근무번호 확인순번 TCS본부명 지사명 영업소코드 영업소명 발급일시 발급시분초 Unnamed: 17
0 강원본부 홍천 217 북원주 20230322 60406 1 1종 20230322 3601 7 강원본부 홍천 174 춘천 322 52400 NaN
1 대구경북본부 구미 126 왜관 20230322 60429 1 1종 20230322 3601 78 대구경북본부 대구 129 북대구 322 55200 NaN
2 대구경북본부 대구 135 경주 20230322 60438 4 4종 20230322 3501 35 대구경북본부 구미 126 왜관 20230322 44900 NaN
3 대구경북본부 구미 121 추풍령 20230322 60448 6 6종 20230322 3101 5 대구경북본부 구미 121 추풍령 321 195000 NaN
4 광주전남본부 순천 271 순천 20230322 60505 6 6종 20230322 3801 52 광주전남본부 담양 569 북광주 322 51300 NaN
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
403017 제2서해안고속도로 평택시흥 683 송산마도 20230322 75729 1 1종 20230322 202 404 제2서해안고속도로 평택시흥 685 서시흥 322 72900 NaN
403018 제2서해안고속도로 평택시흥 683 송산마도 20230322 75738 1 1종 20230322 202 405 제2서해안고속도로 평택시흥 685 서시흥 322 73400 NaN
403019 제2서해안고속도로 평택시흥 683 송산마도 20230322 75755 1 1종 20230322 202 406 제2서해안고속도로 평택시흥 685 서시흥 322 72900 NaN
403020 광주전남본부 순천 272 광양 20230322 53826 5 5종 20230322 3801 30 부산경남본부 창원 245 장유 322 40300 NaN
403021 수도권본부 화성 283 발안 20230322 53830 1 1종 20230322 3701 46 수도권본부 군포 254 군자 322 51400 NaN

403022 rows × 18 columns

In [ ]:
data = origin_data[['출구지사명', '출구영업소명', '발급일시', '발급시분초']]
data
Out[ ]:
출구지사명 출구영업소명 발급일시 발급시분초
0 홍천 북원주 322 52400
1 구미 왜관 322 55200
2 대구 경주 20230322 44900
3 구미 추풍령 321 195000
4 순천 순천 322 51300
... ... ... ... ...
403017 평택시흥 송산마도 322 72900
403018 평택시흥 송산마도 322 73400
403019 평택시흥 송산마도 322 72900
403020 순천 광양 322 40300
403021 화성 발안 322 51400

403022 rows × 4 columns

In [ ]:
data = origin_data[['출구영업소명', '처리일자', '처리일시분초']]
data
Out[ ]:
출구영업소명 처리일자 처리일시분초
0 북원주 20230322 60406
1 왜관 20230322 60429
2 경주 20230322 60438
3 추풍령 20230322 60448
4 순천 20230322 60505
... ... ... ...
403017 송산마도 20230322 75729
403018 송산마도 20230322 75738
403019 송산마도 20230322 75755
403020 광양 20230322 53826
403021 발안 20230322 53830

403022 rows × 3 columns

dtype to date¶

In [ ]:
data['처리일자'] = data['처리일자'].astype(str)
data['처리일시분초'] = data['처리일시분초'].astype(str)
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 403022 entries, 0 to 403021
Data columns (total 3 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   출구영업소명  403022 non-null  object
 1   처리일자    403022 non-null  object
 2   처리일시분초  403022 non-null  object
dtypes: object(3)
memory usage: 9.2+ MB
In [ ]:
data['처리일시분초'] = data['처리일시분초'].apply( lambda x: x.zfill(6) )
data
Out[ ]:
출구영업소명 처리일자 처리일시분초
0 북원주 20230322 060406
1 왜관 20230322 060429
2 경주 20230322 060438
3 추풍령 20230322 060448
4 순천 20230322 060505
... ... ... ...
403017 송산마도 20230322 075729
403018 송산마도 20230322 075738
403019 송산마도 20230322 075755
403020 광양 20230322 053826
403021 발안 20230322 053830

403022 rows × 3 columns

In [ ]:
# data['ds'] = data[['처리일자', '처리일시분초']].ap
data['ds'] = data[['처리일자', '처리일시분초']].apply( ' '.join, axis=1 )
data['ds'] = pd.to_datetime(data.ds)
In [ ]:
data.columns = ['user', '처리일자', '처리일시분초', 'ds']
# data.head()
In [ ]:
data = data[['user', 'ds']]
data['y'] = 1
# len(data.user.unique())
data
Out[ ]:
user ds y
0 북원주 2023-03-22 06:04:06 1
1 왜관 2023-03-22 06:04:29 1
2 경주 2023-03-22 06:04:38 1
3 추풍령 2023-03-22 06:04:48 1
4 순천 2023-03-22 06:05:05 1
... ... ... ...
403017 송산마도 2023-03-22 07:57:29 1
403018 송산마도 2023-03-22 07:57:38 1
403019 송산마도 2023-03-22 07:57:55 1
403020 광양 2023-03-22 05:38:26 1
403021 발안 2023-03-22 05:38:30 1

403022 rows × 3 columns

In [ ]:
top10user = data.groupby(by='user').count().sort_values(by='ds', ascending=False).head(10)
users = top10user.reset_index()['user'].unique()
users
Out[ ]:
array(['서서울', '서울', '대동', '동서울', '군자', '북평택', '북대구', '서시흥', '서대구', '천안'],
      dtype=object)
In [ ]:
user_dfs = [ data[data.user == user][['ds','y']].sort_values('ds').reset_index( drop=True) for user in users ]    
In [ ]:
users_dict = dict( zip( users, user_dfs ))
users_dict.keys()
Out[ ]:
dict_keys(['서서울', '서울', '대동', '동서울', '군자', '북평택', '북대구', '서시흥', '서대구', '천안'])
In [ ]:
# users_dict['서서울']
In [ ]:
temp = users_dict['서서울'].set_index('ds').resample( '1T' ).count().reset_index()
temp
Out[ ]:
ds y
0 2023-03-22 00:01:00 2
1 2023-03-22 00:02:00 1
2 2023-03-22 00:03:00 1
3 2023-03-22 00:04:00 2
4 2023-03-22 00:05:00 2
... ... ...
1434 2023-03-22 23:55:00 4
1435 2023-03-22 23:56:00 3
1436 2023-03-22 23:57:00 1
1437 2023-03-22 23:58:00 1
1438 2023-03-22 23:59:00 2

1439 rows × 2 columns

In [ ]:
# px.line( temp, x='ds', y='y', title='Train data' ).show()
px.scatter( temp, x='ds', y='y', title='Train data' ).show()

prepocessing - 0 for each file¶

In [ ]:
# user_dfs = [ data[data.user == user][['ds','y']].sort_values('ds').reset_index( drop=True) for user in users ]    
users = ['서서울', '서울', '대동', '동서울', '군자', '북평택', '북대구', '서시흥', '서대구', '천안']
# origin_data
In [ ]:
origin_data[origin_data['출구영업소명'] == '서서울'].index
Out[ ]:
7843
In [ ]:
# indexes = [ list(origin_data[origin_data['출구영업소명'] == user].index) for user in users ]
# indexes = sum( indexes, [] )
indexes = sum( [ list(origin_data[origin_data['출구영업소명'] == user].index) for user in users ], [] )


origin_data.loc[indexes, ]['출구영업소명'].unique()
    
Out[ ]:
array(['서서울', '서울', '대동', '동서울', '군자', '북평택', '북대구', '서시흥', '서대구', '천안'],
      dtype=object)
In [ ]:
top10_data = origin_data.loc[indexes, ]
top10_data.shape, origin_data.shape
Out[ ]:
((52926, 18), (403022, 18))
In [ ]:
top10_data.to_csv('../../sample/sample.csv', header=True )

분산 데이터 통으로 불러와서 편집하기¶

In [ ]:
import os

file_list = [ '/'.join(('./data', dir, os.listdir( f'./data/{dir}')[0])) for dir in os.listdir( './data') ]
print( len(file_list) )
file_list
91
Out[ ]:
['./data/TCS_TCS원시자료_1일_1일_20230322/TCS_17_04_01_572191.csv',
 './data/TCS_TCS원시자료_1일_1일_20230323/TCS_17_04_01_594836.csv',
 './data/TCS_TCS원시자료_1일_1일_20230324/TCS_17_04_01_511825.csv',
 './data/TCS_TCS원시자료_1일_1일_20230325/TCS_17_04_01_454206.csv',
 './data/TCS_TCS원시자료_1일_1일_20230326/TCS_17_04_01_997192.csv',
 './data/TCS_TCS원시자료_1일_1일_20230327/TCS_17_04_01_459303.csv',
 './data/TCS_TCS원시자료_1일_1일_20230328/TCS_17_04_01_479567.csv',
 './data/TCS_TCS원시자료_1일_1일_20230329/TCS_17_04_01_698049.csv',
 './data/TCS_TCS원시자료_1일_1일_20230330/TCS_17_04_01_102969.csv',
 './data/TCS_TCS원시자료_1일_1일_20230331/TCS_17_04_01_328531.csv',
 './data/TCS_TCS원시자료_1일_1일_20230401/TCS_17_04_01_585985.csv',
 './data/TCS_TCS원시자료_1일_1일_20230402/TCS_17_04_01_314188.csv',
 './data/TCS_TCS원시자료_1일_1일_20230403/TCS_17_04_01_454817.csv',
 './data/TCS_TCS원시자료_1일_1일_20230404/TCS_17_04_01_208960.csv',
 './data/TCS_TCS원시자료_1일_1일_20230405/TCS_17_04_01_745475.csv',
 './data/TCS_TCS원시자료_1일_1일_20230406/TCS_17_04_01_373973.csv',
 './data/TCS_TCS원시자료_1일_1일_20230407/TCS_17_04_01_793664.csv',
 './data/TCS_TCS원시자료_1일_1일_20230408/TCS_17_04_01_936033.csv',
 './data/TCS_TCS원시자료_1일_1일_20230409/TCS_17_04_01_246528.csv',
 './data/TCS_TCS원시자료_1일_1일_20230410/TCS_17_04_01_374187.csv',
 './data/TCS_TCS원시자료_1일_1일_20230411/TCS_17_04_01_627491.csv',
 './data/TCS_TCS원시자료_1일_1일_20230412/TCS_17_04_01_980498.csv',
 './data/TCS_TCS원시자료_1일_1일_20230413/TCS_17_04_01_181829.csv',
 './data/TCS_TCS원시자료_1일_1일_20230414/TCS_17_04_01_537217.csv',
 './data/TCS_TCS원시자료_1일_1일_20230415/TCS_17_04_01_838312.csv',
 './data/TCS_TCS원시자료_1일_1일_20230416/TCS_17_04_01_208410.csv',
 './data/TCS_TCS원시자료_1일_1일_20230417/TCS_17_04_01_294167.csv',
 './data/TCS_TCS원시자료_1일_1일_20230418/TCS_17_04_01_178106.csv',
 './data/TCS_TCS원시자료_1일_1일_20230419/TCS_17_04_01_592455.csv',
 './data/TCS_TCS원시자료_1일_1일_20230420/TCS_17_04_01_690603.csv',
 './data/TCS_TCS원시자료_1일_1일_20230421/TCS_17_04_01_553941.csv',
 './data/TCS_TCS원시자료_1일_1일_20230422/TCS_17_04_01_698080.csv',
 './data/TCS_TCS원시자료_1일_1일_20230423/TCS_17_04_01_426892.csv',
 './data/TCS_TCS원시자료_1일_1일_20230424/TCS_17_04_01_539780.csv',
 './data/TCS_TCS원시자료_1일_1일_20230425/TCS_17_04_01_210760.csv',
 './data/TCS_TCS원시자료_1일_1일_20230426/TCS_17_04_01_113010.csv',
 './data/TCS_TCS원시자료_1일_1일_20230427/TCS_17_04_01_355113.csv',
 './data/TCS_TCS원시자료_1일_1일_20230428/TCS_17_04_01_569475.csv',
 './data/TCS_TCS원시자료_1일_1일_20230429/TCS_17_04_01_168462.csv',
 './data/TCS_TCS원시자료_1일_1일_20230430/TCS_17_04_01_438306.csv',
 './data/TCS_TCS원시자료_1일_1일_20230501/TCS_17_04_01_549424.csv',
 './data/TCS_TCS원시자료_1일_1일_20230502/TCS_17_04_01_511276.csv',
 './data/TCS_TCS원시자료_1일_1일_20230503/TCS_17_04_01_906155.csv',
 './data/TCS_TCS원시자료_1일_1일_20230504/TCS_17_04_01_845606.csv',
 './data/TCS_TCS원시자료_1일_1일_20230505/TCS_17_04_01_645771.csv',
 './data/TCS_TCS원시자료_1일_1일_20230506/TCS_17_04_01_810602.csv',
 './data/TCS_TCS원시자료_1일_1일_20230507/TCS_17_04_01_625537.csv',
 './data/TCS_TCS원시자료_1일_1일_20230508/TCS_17_04_01_975005.csv',
 './data/TCS_TCS원시자료_1일_1일_20230509/TCS_17_04_01_836542.csv',
 './data/TCS_TCS원시자료_1일_1일_20230510/TCS_17_04_01_448530.csv',
 './data/TCS_TCS원시자료_1일_1일_20230511/TCS_17_04_01_271156.csv',
 './data/TCS_TCS원시자료_1일_1일_20230512/TCS_17_04_01_125034.csv',
 './data/TCS_TCS원시자료_1일_1일_20230513/TCS_17_04_01_847956.csv',
 './data/TCS_TCS원시자료_1일_1일_20230514/TCS_17_04_01_145481.csv',
 './data/TCS_TCS원시자료_1일_1일_20230515/TCS_17_04_01_881374.csv',
 './data/TCS_TCS원시자료_1일_1일_20230516/TCS_17_04_01_539384.csv',
 './data/TCS_TCS원시자료_1일_1일_20230517/TCS_17_04_01_154911.csv',
 './data/TCS_TCS원시자료_1일_1일_20230518/TCS_17_04_01_990539.csv',
 './data/TCS_TCS원시자료_1일_1일_20230519/TCS_17_04_01_988952.csv',
 './data/TCS_TCS원시자료_1일_1일_20230520/TCS_17_04_01_104770.csv',
 './data/TCS_TCS원시자료_1일_1일_20230521/TCS_17_04_01_605243.csv',
 './data/TCS_TCS원시자료_1일_1일_20230522/TCS_17_04_01_273293.csv',
 './data/TCS_TCS원시자료_1일_1일_20230523/TCS_17_04_01_454634.csv',
 './data/TCS_TCS원시자료_1일_1일_20230524/TCS_17_04_01_134861.csv',
 './data/TCS_TCS원시자료_1일_1일_20230525/TCS_17_04_01_787377.csv',
 './data/TCS_TCS원시자료_1일_1일_20230526/TCS_17_04_01_926023.csv',
 './data/TCS_TCS원시자료_1일_1일_20230527/TCS_17_04_01_606250.csv',
 './data/TCS_TCS원시자료_1일_1일_20230528/TCS_17_04_01_772637.csv',
 './data/TCS_TCS원시자료_1일_1일_20230529/TCS_17_04_01_911282.csv',
 './data/TCS_TCS원시자료_1일_1일_20230530/TCS_17_04_01_563768.csv',
 './data/TCS_TCS원시자료_1일_1일_20230531/TCS_17_04_01_188543.csv',
 './data/TCS_TCS원시자료_1일_1일_20230601/TCS_17_04_01_299447.csv',
 './data/TCS_TCS원시자료_1일_1일_20230602/TCS_17_04_01_465834.csv',
 './data/TCS_TCS원시자료_1일_1일_20230603/TCS_17_04_01_632221.csv',
 './data/TCS_TCS원시자료_1일_1일_20230604/TCS_17_04_01_798608.csv',
 './data/TCS_TCS원시자료_1일_1일_20230605/TCS_17_04_01_937253.csv',
 './data/TCS_TCS원시자료_1일_1일_20230606/TCS_17_04_01_103579.csv',
 './data/TCS_TCS원시자료_1일_1일_20230607/TCS_17_04_01_214514.csv',
 './data/TCS_TCS원시자료_1일_1일_20230608/TCS_17_04_01_353160.csv',
 './data/TCS_TCS원시자료_1일_1일_20230609/TCS_17_04_01_491805.csv',
 './data/TCS_TCS원시자료_1일_1일_20230610/TCS_17_04_01_630451.csv',
 './data/TCS_TCS원시자료_1일_1일_20230611/TCS_17_04_01_769096.csv',
 './data/TCS_TCS원시자료_1일_1일_20230612/TCS_17_04_01_421582.csv',
 './data/TCS_TCS원시자료_1일_1일_20230613/TCS_17_04_01_532517.csv',
 './data/TCS_TCS원시자료_1일_1일_20230614/TCS_17_04_01_185003.csv',
 './data/TCS_TCS원시자료_1일_1일_20230615/TCS_17_04_01_295937.csv',
 './data/TCS_TCS원시자료_1일_1일_20230616/TCS_17_04_01_920712.csv',
 './data/TCS_TCS원시자료_1일_1일_20230617/TCS_17_04_01_545487.csv',
 './data/TCS_TCS원시자료_1일_1일_20230618/TCS_17_04_01_656422.csv',
 './data/TCS_TCS원시자료_1일_1일_20230619/TCS_17_04_01_767326.csv',
 './data/TCS_TCS원시자료_1일_1일_20230620/TCS_17_04_01_460341.csv']
In [ ]:
import os
import pandas as pd

file_list = [ '/'.join(('./data', dir, os.listdir( f'./data/{dir}')[0])) for dir in os.listdir( './data') ]

users = ['서서울', '서울', '대동', '동서울', '군자', '북평택', '북대구', '서시흥', '서대구', '천안']

picking_data_files = []
for file in file_list:
    print( file, 'start~')
    df = pd.read_csv(file, encoding='euc-kr')
    indexes = sum( [ list(df[df['출구영업소명'] == user].index) for user in users ], [] )
    picking_data_files.append(df.loc[indexes, ])
./data/TCS_TCS원시자료_1일_1일_20230322/TCS_17_04_01_572191.csv start~
./data/TCS_TCS원시자료_1일_1일_20230323/TCS_17_04_01_594836.csv start~
./data/TCS_TCS원시자료_1일_1일_20230324/TCS_17_04_01_511825.csv start~
./data/TCS_TCS원시자료_1일_1일_20230325/TCS_17_04_01_454206.csv start~
./data/TCS_TCS원시자료_1일_1일_20230326/TCS_17_04_01_997192.csv start~
./data/TCS_TCS원시자료_1일_1일_20230327/TCS_17_04_01_459303.csv start~
./data/TCS_TCS원시자료_1일_1일_20230328/TCS_17_04_01_479567.csv start~
./data/TCS_TCS원시자료_1일_1일_20230329/TCS_17_04_01_698049.csv start~
./data/TCS_TCS원시자료_1일_1일_20230330/TCS_17_04_01_102969.csv start~
./data/TCS_TCS원시자료_1일_1일_20230331/TCS_17_04_01_328531.csv start~
./data/TCS_TCS원시자료_1일_1일_20230401/TCS_17_04_01_585985.csv start~
./data/TCS_TCS원시자료_1일_1일_20230402/TCS_17_04_01_314188.csv start~
./data/TCS_TCS원시자료_1일_1일_20230403/TCS_17_04_01_454817.csv start~
./data/TCS_TCS원시자료_1일_1일_20230404/TCS_17_04_01_208960.csv start~
./data/TCS_TCS원시자료_1일_1일_20230405/TCS_17_04_01_745475.csv start~
./data/TCS_TCS원시자료_1일_1일_20230406/TCS_17_04_01_373973.csv start~
./data/TCS_TCS원시자료_1일_1일_20230407/TCS_17_04_01_793664.csv start~
./data/TCS_TCS원시자료_1일_1일_20230408/TCS_17_04_01_936033.csv start~
./data/TCS_TCS원시자료_1일_1일_20230409/TCS_17_04_01_246528.csv start~
./data/TCS_TCS원시자료_1일_1일_20230410/TCS_17_04_01_374187.csv start~
./data/TCS_TCS원시자료_1일_1일_20230411/TCS_17_04_01_627491.csv start~
./data/TCS_TCS원시자료_1일_1일_20230412/TCS_17_04_01_980498.csv start~
./data/TCS_TCS원시자료_1일_1일_20230413/TCS_17_04_01_181829.csv start~
./data/TCS_TCS원시자료_1일_1일_20230414/TCS_17_04_01_537217.csv start~
./data/TCS_TCS원시자료_1일_1일_20230415/TCS_17_04_01_838312.csv start~
./data/TCS_TCS원시자료_1일_1일_20230416/TCS_17_04_01_208410.csv start~
./data/TCS_TCS원시자료_1일_1일_20230417/TCS_17_04_01_294167.csv start~
./data/TCS_TCS원시자료_1일_1일_20230418/TCS_17_04_01_178106.csv start~
./data/TCS_TCS원시자료_1일_1일_20230419/TCS_17_04_01_592455.csv start~
./data/TCS_TCS원시자료_1일_1일_20230420/TCS_17_04_01_690603.csv start~
./data/TCS_TCS원시자료_1일_1일_20230421/TCS_17_04_01_553941.csv start~
./data/TCS_TCS원시자료_1일_1일_20230422/TCS_17_04_01_698080.csv start~
./data/TCS_TCS원시자료_1일_1일_20230423/TCS_17_04_01_426892.csv start~
./data/TCS_TCS원시자료_1일_1일_20230424/TCS_17_04_01_539780.csv start~
./data/TCS_TCS원시자료_1일_1일_20230425/TCS_17_04_01_210760.csv start~
./data/TCS_TCS원시자료_1일_1일_20230426/TCS_17_04_01_113010.csv start~
./data/TCS_TCS원시자료_1일_1일_20230427/TCS_17_04_01_355113.csv start~
./data/TCS_TCS원시자료_1일_1일_20230428/TCS_17_04_01_569475.csv start~
./data/TCS_TCS원시자료_1일_1일_20230429/TCS_17_04_01_168462.csv start~
./data/TCS_TCS원시자료_1일_1일_20230430/TCS_17_04_01_438306.csv start~
./data/TCS_TCS원시자료_1일_1일_20230501/TCS_17_04_01_549424.csv start~
./data/TCS_TCS원시자료_1일_1일_20230502/TCS_17_04_01_511276.csv start~
./data/TCS_TCS원시자료_1일_1일_20230503/TCS_17_04_01_906155.csv start~
./data/TCS_TCS원시자료_1일_1일_20230504/TCS_17_04_01_845606.csv start~
./data/TCS_TCS원시자료_1일_1일_20230505/TCS_17_04_01_645771.csv start~
./data/TCS_TCS원시자료_1일_1일_20230506/TCS_17_04_01_810602.csv start~
./data/TCS_TCS원시자료_1일_1일_20230507/TCS_17_04_01_625537.csv start~
./data/TCS_TCS원시자료_1일_1일_20230508/TCS_17_04_01_975005.csv start~
./data/TCS_TCS원시자료_1일_1일_20230509/TCS_17_04_01_836542.csv start~
./data/TCS_TCS원시자료_1일_1일_20230510/TCS_17_04_01_448530.csv start~
./data/TCS_TCS원시자료_1일_1일_20230511/TCS_17_04_01_271156.csv start~
./data/TCS_TCS원시자료_1일_1일_20230512/TCS_17_04_01_125034.csv start~
./data/TCS_TCS원시자료_1일_1일_20230513/TCS_17_04_01_847956.csv start~
./data/TCS_TCS원시자료_1일_1일_20230514/TCS_17_04_01_145481.csv start~
./data/TCS_TCS원시자료_1일_1일_20230515/TCS_17_04_01_881374.csv start~
./data/TCS_TCS원시자료_1일_1일_20230516/TCS_17_04_01_539384.csv start~
./data/TCS_TCS원시자료_1일_1일_20230517/TCS_17_04_01_154911.csv start~
./data/TCS_TCS원시자료_1일_1일_20230518/TCS_17_04_01_990539.csv start~
./data/TCS_TCS원시자료_1일_1일_20230519/TCS_17_04_01_988952.csv start~
./data/TCS_TCS원시자료_1일_1일_20230520/TCS_17_04_01_104770.csv start~
./data/TCS_TCS원시자료_1일_1일_20230521/TCS_17_04_01_605243.csv start~
./data/TCS_TCS원시자료_1일_1일_20230522/TCS_17_04_01_273293.csv start~
./data/TCS_TCS원시자료_1일_1일_20230523/TCS_17_04_01_454634.csv start~
./data/TCS_TCS원시자료_1일_1일_20230524/TCS_17_04_01_134861.csv start~
./data/TCS_TCS원시자료_1일_1일_20230525/TCS_17_04_01_787377.csv start~
./data/TCS_TCS원시자료_1일_1일_20230526/TCS_17_04_01_926023.csv start~
./data/TCS_TCS원시자료_1일_1일_20230527/TCS_17_04_01_606250.csv start~
./data/TCS_TCS원시자료_1일_1일_20230528/TCS_17_04_01_772637.csv start~
./data/TCS_TCS원시자료_1일_1일_20230529/TCS_17_04_01_911282.csv start~
./data/TCS_TCS원시자료_1일_1일_20230530/TCS_17_04_01_563768.csv start~
./data/TCS_TCS원시자료_1일_1일_20230531/TCS_17_04_01_188543.csv start~
./data/TCS_TCS원시자료_1일_1일_20230601/TCS_17_04_01_299447.csv start~
./data/TCS_TCS원시자료_1일_1일_20230602/TCS_17_04_01_465834.csv start~
./data/TCS_TCS원시자료_1일_1일_20230603/TCS_17_04_01_632221.csv start~
./data/TCS_TCS원시자료_1일_1일_20230604/TCS_17_04_01_798608.csv start~
./data/TCS_TCS원시자료_1일_1일_20230605/TCS_17_04_01_937253.csv start~
./data/TCS_TCS원시자료_1일_1일_20230606/TCS_17_04_01_103579.csv start~
./data/TCS_TCS원시자료_1일_1일_20230607/TCS_17_04_01_214514.csv start~
./data/TCS_TCS원시자료_1일_1일_20230608/TCS_17_04_01_353160.csv start~
./data/TCS_TCS원시자료_1일_1일_20230609/TCS_17_04_01_491805.csv start~
./data/TCS_TCS원시자료_1일_1일_20230610/TCS_17_04_01_630451.csv start~
./data/TCS_TCS원시자료_1일_1일_20230611/TCS_17_04_01_769096.csv start~
./data/TCS_TCS원시자료_1일_1일_20230612/TCS_17_04_01_421582.csv start~
./data/TCS_TCS원시자료_1일_1일_20230613/TCS_17_04_01_532517.csv start~
./data/TCS_TCS원시자료_1일_1일_20230614/TCS_17_04_01_185003.csv start~
./data/TCS_TCS원시자료_1일_1일_20230615/TCS_17_04_01_295937.csv start~
./data/TCS_TCS원시자료_1일_1일_20230616/TCS_17_04_01_920712.csv start~
./data/TCS_TCS원시자료_1일_1일_20230617/TCS_17_04_01_545487.csv start~
./data/TCS_TCS원시자료_1일_1일_20230618/TCS_17_04_01_656422.csv start~
./data/TCS_TCS원시자료_1일_1일_20230619/TCS_17_04_01_767326.csv start~
./data/TCS_TCS원시자료_1일_1일_20230620/TCS_17_04_01_460341.csv start~
In [ ]:
# import pickle

# with open( './sample/sample.pickle', 'wb') as fwb:
#     pickle.dump( picking_data_files, fwb, protocol=pickle.HIGHEST_PROTOCOL )
In [ ]:
full_picking_data = pd.concat(picking_data_files,axis=0)
full_picking_data
Out[ ]:
출구본부명 출구지사명 출구영업소코드 출구영업소명 처리일자 처리일시분초 TCS차종구분코드 TCS차종구분명 근무일자 근무번호 확인순번 TCS본부명 지사명 영업소코드 영업소명 발급일시 발급시분초 Unnamed: 17
104 수도권본부 시흥 253 서서울 20230322 74831 6 6종 20230322 4602 117 수도권본부 군포 256 안산 322 74100 NaN
153 수도권본부 시흥 253 서서울 20230322 103844 1 1종 20230322 4001 405 대전충남본부 당진 288 당진 20230322 95800 NaN
423 수도권본부 시흥 253 서서울 20230322 84252 1 1종 20230322 4001 200 수도권본부 군포 102 동수원 20230322 82600 NaN
435 수도권본부 시흥 253 서서울 20230322 84806 6 6종 20230322 4602 174 수도권본부 화성 282 비봉 322 83600 NaN
452 수도권본부 시흥 253 서서울 20230322 85331 4 4종 20230322 3801 172 강원본부 이천 173 양지 322 80100 NaN
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
352065 대전충남본부 천안 108 천안 20230620 133405 1 1종 20230620 3501 542 수도권본부 수원 103 수원신갈 20230620 125000 NaN
352066 대전충남본부 천안 108 천안 20230620 133423 1 1종 20230620 3501 543 수도권본부 수원 742 남사진위 620 123600 NaN
352067 대전충남본부 천안 108 천안 20230620 133611 1 1종 20230620 3501 544 수도권본부 군포 102 동수원 620 125800 NaN
352068 대전충남본부 천안 108 천안 20230620 133633 1 1종 20230620 3501 545 수도권본부 군포 254 군자 620 114400 NaN
352069 대전충남본부 천안 108 천안 20230620 133716 1 1종 20230620 3501 546 대전충남본부 천안 107 안성 20230620 130000 NaN

4846621 rows × 18 columns

In [ ]:
full_picking_data.shape
Out[ ]:
(4846621, 18)
In [ ]:
full_picking_data.columns
Out[ ]:
Index(['출구본부명', '출구지사명', '출구영업소코드', '출구영업소명', '처리일자', '처리일시분초', 'TCS차종구분코드',
       'TCS차종구분명', '근무일자', '근무번호', '확인순번', 'TCS본부명', '지사명', '영업소코드', '영업소명',
       '발급일시', '발급시분초', 'Unnamed: 17'],
      dtype='object')
In [ ]:
full_picking_data[[ '출구영업소명',  '처리일자', '처리일시분초']]
Out[ ]:
출구영업소명 처리일자 처리일시분초
104 서서울 20230322 074831
153 서서울 20230322 103844
423 서서울 20230322 084252
435 서서울 20230322 084806
452 서서울 20230322 085331
... ... ... ...
352065 천안 20230620 133405
352066 천안 20230620 133423
352067 천안 20230620 133611
352068 천안 20230620 133633
352069 천안 20230620 133716

4846621 rows × 3 columns

In [ ]:
len(full_picking_data['처리일자'].unique())
Out[ ]:
91
In [ ]:
full_picking_data['처리일자'] = full_picking_data['처리일자'].astype(str)
full_picking_data['처리일시분초'] = full_picking_data['처리일시분초'].astype(str)
full_picking_data['처리일시분초'] = full_picking_data['처리일시분초'].apply( lambda x: x.zfill(6) )
full_picking_data['ds'] = full_picking_data[['처리일자', '처리일시분초']].apply( ' '.join, axis=1 )
full_picking_data['ds'] = pd.to_datetime(full_picking_data.ds)

full_picking_data[[ '출구영업소명', 'ds']]
Out[ ]:
출구영업소명 ds
104 서서울 2023-03-22 07:48:31
153 서서울 2023-03-22 10:38:44
423 서서울 2023-03-22 08:42:52
435 서서울 2023-03-22 08:48:06
452 서서울 2023-03-22 08:53:31
... ... ...
352065 천안 2023-06-20 13:34:05
352066 천안 2023-06-20 13:34:23
352067 천안 2023-06-20 13:36:11
352068 천안 2023-06-20 13:36:33
352069 천안 2023-06-20 13:37:16

4846621 rows × 2 columns

In [ ]:
compat_data = full_picking_data[[ '출구영업소명', 'ds']]
compat_data['y'] = 1
compat_data.columns = [ 'user', 'ds', 'y']
compat_data.sort_values('ds', inplace=True)
compat_data.reset_index( drop=True, inplace=True)
compat_data
/tmp/ipykernel_13028/174773155.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  compat_data['y'] = 1
/tmp/ipykernel_13028/174773155.py:4: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  compat_data.sort_values('ds', inplace=True)
Out[ ]:
user ds y
0 북대구 2023-03-22 00:00:06 1
1 서대구 2023-03-22 00:00:10 1
2 동서울 2023-03-22 00:00:15 1
3 군자 2023-03-22 00:00:19 1
4 서울 2023-03-22 00:00:21 1
... ... ... ...
4846616 서울 2023-06-20 23:59:43 1
4846617 서서울 2023-06-20 23:59:45 1
4846618 동서울 2023-06-20 23:59:45 1
4846619 군자 2023-06-20 23:59:54 1
4846620 동서울 2023-06-20 23:59:54 1

4846621 rows × 3 columns

In [ ]:
# import pickle

# with open( './sample/top10users.pickle', 'wb') as fwb:
#     pickle.dump( compat_data, fwb, protocol=pickle.HIGHEST_PROTOCOL )
In [ ]:
import pickle

with open( './sample/top10users.pickle', 'rb') as frb:
    compat_data = pickle.load( frb  )
In [ ]:
smp_data = compat_data[compat_data.user == '서서울'][['ds','y']].sort_values('ds').reset_index( drop=True)
smp_data
Out[ ]:
ds y
0 2023-03-22 00:01:25 1
1 2023-03-22 00:01:42 1
2 2023-03-22 00:02:04 1
3 2023-03-22 00:03:39 1
4 2023-03-22 00:04:33 1
... ... ...
739969 2023-06-20 23:58:27 1
739970 2023-06-20 23:58:51 1
739971 2023-06-20 23:59:03 1
739972 2023-06-20 23:59:13 1
739973 2023-06-20 23:59:45 1

739974 rows × 2 columns

각 user 최대값 확인¶

In [ ]:
users = ['서서울', '서울', '대동', '동서울', '군자', '북평택', '북대구', '서시흥', '서대구', '천안']

for idx, user in enumerate(users):
    print(
        f'''
        user : {idx} - {user}
        max : {compat_data[compat_data.user == user][['ds','y']].sort_values('ds').set_index('ds').resample( '1T' ).count().reset_index()['y'].max()}
        '''
    )
    # compat_data[compat_data.user == user][['ds','y']].sort_values('ds').set_index('ds').resample( '1T' ).count().reset_index()['y'].max()
        user : 0 - 서서울
        max : 20
        

        user : 1 - 서울
        max : 19
        

        user : 2 - 대동
        max : 18
        

        user : 3 - 동서울
        max : 15
        

        user : 4 - 군자
        max : 14
        

        user : 5 - 북평택
        max : 15
        

        user : 6 - 북대구
        max : 14
        

        user : 7 - 서시흥
        max : 11
        

        user : 8 - 서대구
        max : 14
        

        user : 9 - 천안
        max : 13
        
In [ ]:
temp_data = smp_data[['ds', 'y']].set_index('ds').resample( '1T' ).count().reset_index()
px.line( temp_data, x='ds', y='y', title='Train data' ).show()
px.scatter( temp_data, x='ds', y='y', title='Train data' ).show()

End of File¶